{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "f64e2dff-097a-4e59-a53a-773d1c2356da",
   "metadata": {},
   "source": [
    "# `go-huggingface` Demo\n",
    "\n",
    "This demo shows how to download files from HuggingFace repositories, create tokenizers, run a converted ONNX model with GoMLX, and read a dataset file."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d36287c1-e5e0-4985-83ac-1f40e8b850c7",
   "metadata": {},
   "source": [
    "## Imports and `go work` setup\n",
    "\n",
    "This is used during development, to instruct the Notebook kernel [gonb](https://github.com/janpfeifer/gonb) to use the local version of the libraries."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "8941d23e-19f2-4cb4-8538-b6acecfdba61",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "**GoNB** version [v0.10.6](https://github.com/janpfeifer/gonb/releases/tag/v0.10.6) / Commit: [0e5f587a077810d058202b76a127651a02bd4382](https://github.com/janpfeifer/gonb/tree/0e5f587a077810d058202b76a127651a02bd4382)\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\t- Added replace rule for module \"github.com/gomlx/go-huggingface\" to local directory \"/home/janpf/Projects/go-huggingface\".\n",
      "\t- Added replace rule for module \"github.com/gomlx/gomlx\" to local directory \"/home/janpf/Projects/gomlx\".\n",
      "\t- Added replace rule for module \"github.com/gomlx/onnx-gomlx\" to local directory \"/home/janpf/Projects/onnx-gomlx\".\n",
      "\t- Added replace rule for module \"github.com/gomlx/gemma\" to local directory \"/home/janpf/Projects/gemma\".\n"
     ]
    }
   ],
   "source": [
    "%version\n",
    "!*rm -f go.work && go work init\n",
    "!*go work use . \"${HOME}/Projects/gomlx\" \"${HOME}/Projects/go-huggingface\" \"${HOME}/Projects/gemma\" \"${HOME}/Projects/onnx-gomlx\"\n",
    "%goworkfix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "fe1baf48-95fd-4e8c-94a5-e09f1a723559",
   "metadata": {},
   "outputs": [],
   "source": [
    "import (\n",
    "    \"github.com/janpfeifer/must\"\n",
    "    \"github.com/gomlx/go-huggingface/hub\"\n",
    "    \"github.com/gomlx/go-huggingface/tokenizers\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8bc02638-f04d-4fd6-a199-8cb5fec99600",
   "metadata": {},
   "source": [
    "## Download `tokenizer_config.json` and enumerate `tokenizer_class` for several models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "43f3d2af-29b2-488e-97c4-22942dcecab2",
   "metadata": {},
   "outputs": [],
   "source": [
    "var (\n",
    "    // HuggingFace authentication token read from environment.\n",
    "    // It can be created in https://huggingface.co\n",
    "    // Some files may require it for downloading.\n",
    "    hfAuthToken = os.Getenv(\"HF_TOKEN\")\n",
    "\n",
    "    // Model ids for testing.\n",
    "    hfModelIDs = []string{\n",
    "        \"google/gemma-2-2b-it\",\n",
    "        \"sentence-transformers/all-MiniLM-L6-v2\",\n",
    "        \"protectai/deberta-v3-base-zeroshot-v1-onnx\",\n",
    "        \"KnightsAnalytics/distilbert-base-uncased-finetuned-sst-2-english\",\n",
    "        \"KnightsAnalytics/distilbert-NER\",\n",
    "        \"KnightsAnalytics/all-MiniLM-L6-v2\",\n",
    "        \"SamLowe/roberta-base-go_emotions-onnx\",\n",
    "    }\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "dd4f535a-cfd7-4e10-9660-78179caa949b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "google/gemma-2-2b-it:\n",
      "\t.gitattributes\n",
      "\tREADME.md\n",
      "\tconfig.json\n",
      "\tgeneration_config.json\n",
      "\tmodel-00001-of-00002.safetensors\n",
      "\tmodel-00002-of-00002.safetensors\n",
      "\tmodel.safetensors.index.json\n",
      "\tspecial_tokens_map.json\n",
      "\ttokenizer.json\n",
      "\ttokenizer.model\n",
      "\ttokenizer_config.json\n",
      "\n",
      "sentence-transformers/all-MiniLM-L6-v2:\n",
      "\t.gitattributes\n",
      "\t1_Pooling/config.json\n",
      "\tREADME.md\n",
      "\tconfig.json\n",
      "\tconfig_sentence_transformers.json\n",
      "\tdata_config.json\n",
      "\tmodel.safetensors\n",
      "\tmodules.json\n",
      "\tonnx/model.onnx\n",
      "\tonnx/model_O1.onnx\n",
      "\tonnx/model_O2.onnx\n",
      "\tonnx/model_O3.onnx\n",
      "\tonnx/model_O4.onnx\n",
      "\tonnx/model_qint8_arm64.onnx\n",
      "\tonnx/model_qint8_avx512.onnx\n",
      "\tonnx/model_qint8_avx512_vnni.onnx\n",
      "\tonnx/model_quint8_avx2.onnx\n",
      "\topenvino/openvino_model.bin\n",
      "\topenvino/openvino_model.xml\n",
      "\topenvino/openvino_model_qint8_quantized.bin\n",
      "\topenvino/openvino_model_qint8_quantized.xml\n",
      "\tpytorch_model.bin\n",
      "\trust_model.ot\n",
      "\tsentence_bert_config.json\n",
      "\tspecial_tokens_map.json\n",
      "\ttf_model.h5\n",
      "\ttokenizer.json\n",
      "\ttokenizer_config.json\n",
      "\ttrain_script.py\n",
      "\tvocab.txt\n",
      "\n",
      "protectai/deberta-v3-base-zeroshot-v1-onnx:\n",
      "\t.gitattributes\n",
      "\tREADME.md\n",
      "\tadded_tokens.json\n",
      "\tconfig.json\n",
      "\tmerges.txt\n",
      "\tmodel.onnx\n",
      "\tspecial_tokens_map.json\n",
      "\tspm.model\n",
      "\ttokenizer.json\n",
      "\ttokenizer_config.json\n",
      "\tvocab.json\n",
      "\n",
      "KnightsAnalytics/distilbert-base-uncased-finetuned-sst-2-english:\n",
      "\t.gitattributes\n",
      "\tREADME.md\n",
      "\tconfig.json\n",
      "\tmodel.onnx\n",
      "\tspecial_tokens_map.json\n",
      "\ttokenizer.json\n",
      "\ttokenizer_config.json\n",
      "\tvocab.txt\n",
      "\n",
      "KnightsAnalytics/distilbert-NER:\n",
      "\t.gitattributes\n",
      "\tREADME.md\n",
      "\tconfig.json\n",
      "\tmodel.onnx\n",
      "\tspecial_tokens_map.json\n",
      "\ttokenizer.json\n",
      "\ttokenizer_config.json\n",
      "\tvocab.txt\n",
      "\n",
      "KnightsAnalytics/all-MiniLM-L6-v2:\n",
      "\t.gitattributes\n",
      "\tconfig.json\n",
      "\tdata_config.json\n",
      "\tmodel.onnx\n",
      "\tmodules.json\n",
      "\tspecial_tokens_map.json\n",
      "\ttokenizer.json\n",
      "\ttokenizer_config.json\n",
      "\tvocab.txt\n",
      "\n",
      "SamLowe/roberta-base-go_emotions-onnx:\n",
      "\t.gitattributes\n",
      "\tREADME.md\n",
      "\tconfig.json\n",
      "\tmerges.txt\n",
      "\tonnx/config.json\n",
      "\tonnx/merges.txt\n",
      "\tonnx/model.onnx\n",
      "\tonnx/model_quantized.onnx\n",
      "\tonnx/ort_config.json\n",
      "\tonnx/ort_config_quantized.json\n",
      "\tonnx/special_tokens_map.json\n",
      "\tonnx/tokenizer.json\n",
      "\tonnx/tokenizer_config.json\n",
      "\tonnx/vocab.json\n",
      "\tspecial_tokens_map.json\n",
      "\ttokenizer.json\n",
      "\ttokenizer_config.json\n",
      "\ttrainer_state.json\n",
      "\tvocab.json\n"
     ]
    }
   ],
   "source": [
    "%%\n",
    "for _, modelID := range hfModelIDs {\n",
    "    fmt.Printf(\"\\n%s:\\n\", modelID)\n",
    "    repo := hub.New(modelID).WithAuth(hfAuthToken)\n",
    "    for fileName, err := range repo.IterFileNames() {\n",
    "        if err != nil { panic(err) }\n",
    "        fmt.Printf(\"\\t%s\\n\", fileName)\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "80f9eaee-7507-4921-bccd-b8dbcd8bf86a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "google/gemma-2-2b-it:\n",
      "\ttokenizer_class=GemmaTokenizer\n",
      "\n",
      "sentence-transformers/all-MiniLM-L6-v2:\n",
      "\ttokenizer_class=BertTokenizer\n",
      "\n",
      "protectai/deberta-v3-base-zeroshot-v1-onnx:\n",
      "\ttokenizer_class=DebertaV2Tokenizer\n",
      "\n",
      "KnightsAnalytics/distilbert-base-uncased-finetuned-sst-2-english:\n",
      "\ttokenizer_class=DistilBertTokenizer\n",
      "\n",
      "KnightsAnalytics/distilbert-NER:\n",
      "\ttokenizer_class=DistilBertTokenizer\n",
      "\n",
      "KnightsAnalytics/all-MiniLM-L6-v2:\n",
      "\ttokenizer_class=BertTokenizer\n",
      "\n",
      "SamLowe/roberta-base-go_emotions-onnx:\n",
      "\ttokenizer_class=RobertaTokenizer\n"
     ]
    }
   ],
   "source": [
    "%%\n",
    "for _, modelID := range hfModelIDs {\n",
    "    fmt.Printf(\"\\n%s:\\n\", modelID)\n",
    "    repo := hub.New(modelID).WithAuth(hfAuthToken)\n",
    "    config := must.M1(tokenizers.GetConfig(repo))\n",
    "    fmt.Printf(\"\\ttokenizer_class=%s\\n\", config.TokenizerClass)\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3c0a7ae7-ace6-4675-873b-0336efa3c68a",
   "metadata": {},
   "source": [
    "## Create a Tokenizer\n",
    "\n",
    "### Go-only SentencePiece tokenizer:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "f37e14c8-8321-40ac-b87f-1d4a222d6123",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sentence:\tThe book is on the table.\n",
      "Tokens:  \t[651 2870 603 611 573 3037 235265]\n"
     ]
    }
   ],
   "source": [
    "var sentence = \"The book is on the table.\"\n",
    "\n",
    "%%\n",
    "repo := hub.New(\"google/gemma-2-2b-it\").WithAuth(hfAuthToken)\n",
    "tokenizer := must.M1(tokenizers.New(repo))\n",
    "tokens := tokenizer.Encode(sentence)\n",
    "fmt.Printf(\"Sentence:\\t%s\\n\", sentence)\n",
    "fmt.Printf(\"Tokens:  \\t%v\\n\", tokens)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cd2f6792-2e9b-427d-a2c6-316038624349",
   "metadata": {},
   "source": [
    "### Rust based [github.com/daulet/tokenizers](https://github.com/daulet/tokenizers) tokenizer\n",
    "\n",
    "For most tokenizers in HuggingFace, though, there is no Go-only version yet, so for now we use [github.com/daulet/tokenizers](https://github.com/daulet/tokenizers), which is based on a fast tokenizer written in Rust.\n",
    "\n",
    "It requires installing the compiled Rust library, though: see [github.com/daulet/tokenizers](https://github.com/daulet/tokenizers) for installation instructions (prebuilt binaries are provided).\n",
    "\n",
    "> **Note**: `daulet/tokenizers` also provides a simple downloader, so `go-huggingface` is not strictly necessary -- if you don't want the extra dependency and only need the tokenizer, you don't need to use it. `go-huggingface` helps by also allowing you to download other files (models, datasets), and by sharing its cache across different projects and with `huggingface-hub` (the Python downloader library)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "cd706316-ef19-4f25-92dc-a1283af8987d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sentence:\tThe book is on the table.\n",
      "Tokens:  \t[101 1996 2338 2003 2006 1996 2795 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n"
     ]
    }
   ],
   "source": [
    "import dtok \"github.com/daulet/tokenizers\"\n",
    "\n",
    "%%\n",
    "modelID := \"KnightsAnalytics/all-MiniLM-L6-v2\"\n",
    "repo := hub.New(modelID).WithAuth(hfAuthToken)\n",
    "localFile := must.M1(repo.DownloadFile(\"tokenizer.json\"))\n",
    "tokenizer := must.M1(dtok.FromFile(localFile))\n",
    "defer tokenizer.Close()\n",
    "tokens, _ := tokenizer.Encode(sentence, true)\n",
    "\n",
    "fmt.Printf(\"Sentence:\\t%s\\n\", sentence)\n",
    "fmt.Printf(\"Tokens:  \\t%v\\n\", tokens)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "012cda0f-5ed7-418b-a7eb-3de5040a7e2c",
   "metadata": {},
   "source": [
    "## Convert ONNX model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "7e7751da-e53f-47c6-a7d6-e6b760f95417",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sentences: \t[\"This is an example sentence\" \"Each sentence is converted\"]\n",
      "Embeddings:\t[2][7][384]float32{\n",
      " {{0.0366, -0.0162, 0.1682, ..., 0.0554, -0.1644, -0.2967},\n",
      "  {0.7239, 0.6399, 0.1888, ..., 0.5946, 0.6206, 0.4897},\n",
      "  {0.0064, 0.0203, 0.0448, ..., 0.3464, 1.3170, -0.1670},\n",
      "  ...,\n",
      "  {0.1479, -0.0643, 0.1457, ..., 0.8837, -0.3316, 0.2975},\n",
      "  {0.5212, 0.6563, 0.5607, ..., -0.0399, 0.0412, -1.4036},\n",
      "  {1.0824, 0.7140, 0.3986, ..., -0.2301, 0.3243, -1.0313}},\n",
      " {{0.2802, 0.1165, -0.0418, ..., 0.2711, -0.1685, -0.2961},\n",
      "  {0.8729, 0.4545, -0.1091, ..., 0.1365, 0.4580, -0.2042},\n",
      "  {0.4752, 0.5731, 0.6304, ..., 0.6526, 0.5612, -1.3268},\n",
      "  ...,\n",
      "  {0.6113, 0.7920, -0.4685, ..., 0.0854, 1.0592, -0.2983},\n",
      "  {0.4115, 1.0946, 0.2385, ..., 0.8984, 0.3684, -0.7333},\n",
      "  {0.1374, 0.5555, 0.2678, ..., 0.5426, 0.4665, -0.5284}}}\n"
     ]
    }
   ],
   "source": [
    "import (\n",
    "    \"github.com/gomlx/onnx-gomlx/onnx\"\n",
    "    \"github.com/gomlx/gomlx/graph\"\n",
    "    \"github.com/gomlx/gomlx/ml/context\"\n",
    "    \"github.com/gomlx/gomlx/backends\"\n",
    "    _ \"github.com/gomlx/gomlx/backends/xla\"\n",
    ")\n",
    "\n",
    "%%\n",
    "// Get ONNX model.\n",
    "repo := hub.New(\"sentence-transformers/all-MiniLM-L6-v2\").WithAuth(hfAuthToken)\n",
    "onnxFilePath, err := repo.DownloadFile(\"onnx/model.onnx\")\n",
    "if err != nil { panic(err) }\n",
    "onnxModel, err := onnx.ReadFile(onnxFilePath)\n",
    "if err != nil { panic(err) }\n",
    "\n",
    "// Convert ONNX variables to GoMLX context (which stores variables):\n",
    "ctx := context.New()\n",
    "err = onnxModel.VariablesToContext(ctx)\n",
    "if err != nil { panic(err) }\n",
    "\n",
    "// Hard-coded model inputs, one row per sentence.\n",
    "// NOTE(review): inputIDs are presumably the token ids of the sentences as produced by the\n",
    "// model's tokenizer -- confirm. The second row ends with a 0 id whose attentionMask entry\n",
    "// is also 0 (presumably padding).\n",
    "sentences := []string{\n",
    "    \"This is an example sentence\", \n",
    "    \"Each sentence is converted\"}\n",
    "inputIDs := [][]int64{\n",
    "    {101, 2023, 2003, 2019, 2742, 6251,  102},\n",
    "    { 101, 2169, 6251, 2003, 4991,  102,    0}}\n",
    "tokenTypeIDs := [][]int64{\n",
    "    {0, 0, 0, 0, 0, 0, 0},\n",
    "    {0, 0, 0, 0, 0, 0, 0}}\n",
    "attentionMask := [][]int64{\n",
    "    {1, 1, 1, 1, 1, 1, 1},\n",
    "    {1, 1, 1, 1, 1, 1, 0}}\n",
    "// Execute the model once. The graph function maps inputs[0], inputs[1] and inputs[2] to the\n",
    "// ONNX inputs \"input_ids\", \"attention_mask\" and \"token_type_ids\", matching the order in\n",
    "// which inputIDs, attentionMask and tokenTypeIDs are passed at the end of the call.\n",
    "embeddings := context.ExecOnce(\n",
    "    backends.New(), ctx, \n",
    "    func (ctx *context.Context, inputs []*graph.Node) *graph.Node {\n",
    "        modelOutputs := onnxModel.CallGraph(ctx, inputs[0].Graph(), map[string]*graph.Node{\n",
    "            \"input_ids\": inputs[0],\n",
    "            \"attention_mask\": inputs[1],\n",
    "            \"token_type_ids\": inputs[2]})\n",
    "        // Only the first output of the model is returned.\n",
    "        return modelOutputs[0]\n",
    "    }, inputIDs, attentionMask, tokenTypeIDs)\n",
    "\n",
    "fmt.Printf(\"Sentences: \\t%q\\n\", sentences)\n",
    "fmt.Printf(\"Embeddings:\\t%s\\n\", embeddings)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f4ee266a-f0ee-48e0-ad2d-a5bd85a47043",
   "metadata": {},
   "source": [
    "## Download Dataset Files\n",
    "\n",
    "We are going to use the [HuggingFaceFW/fineweb](https://huggingface.co/datasets/HuggingFaceFW/fineweb) dataset as an example: we download one of its sample files (~2.5GB of data) and parse the `.parquet` file.\n",
    "\n",
    "### Structure of file\n",
    "First we define the structure of each entry, with the tags for the Parquet parser:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "3963f645-a63b-43da-9c9d-3340a330fca7",
   "metadata": {},
   "outputs": [],
   "source": [
    "var (\n",
    "    FineWebID = \"HuggingFaceFW/fineweb\"\n",
    "    FineWebSampleFile = \"sample/10BT/000_00000.parquet\"\n",
    ")\n",
    "\n",
    "// FineWebEntry: inspection of fields in parque file done with tool in \n",
    "// github.com/xitongsys/parquet-go/tool/parquet-tools.\n",
    "//\n",
    "// The parquet annotations are described in: https://pkg.go.dev/github.com/parquet-go/parquet-go#SchemaOf\n",
    "type FineWebEntry struct {\n",
    "    Text string `parquet:\"text,snappy\"`\n",
    "    ID string `parquet:\"id,snappy\"`\n",
    "    Dump string `parquet:\"dump,snappy\"`\n",
    "    URL string `parquet:\"url,snappy\"`\n",
    "    Score float64 `parquet:\"language_score\"`\n",
    "}\n",
    "\n",
    "// TrimString returns s trimmed to at most maxLength runes. If trimmed it appends \"…\" at the end.\n",
    "func TrimString(s string, maxLength int) string {\n",
    "    if utf8.RuneCountInString(s) <= maxLength {\n",
    "        return s\n",
    "    }\n",
    "    runes := []rune(s)\n",
    "    return string(runes[:maxLength-1]) + \"…\"\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6a0c90c8-ba8b-4182-92f2-4f7921f8a4f6",
   "metadata": {},
   "source": [
    "### Read the Parquet\n",
    "\n",
    "Using the library [github.com/parquet-go/parquet-go](https://github.com/parquet-go/parquet-go)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "bc2f3084-05fd-450b-b939-9095234fb225",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10 rows read\n",
      "Row 0:\tScore=0.823 Text=[\"|Viewing Single Post From: Spoilers for the Week …\"], URL=[http://daytimeroyaltyonline.com/single/…]\n",
      "Row 1:\tScore=0.974 Text=[\"*sigh* Fundamentalist community, let me pass on s…\"], URL=[http://endogenousretrovirus.blogspot.co…]\n",
      "Row 2:\tScore=0.873 Text=[\"A novel two-step immunotherapy approach has shown…\"], URL=[http://news.cancerconnect.com/]\n",
      "Row 3:\tScore=0.932 Text=[\"Free the Cans! Working Together to Reduce Waste\\nI…\"], URL=[http://sharingsolution.com/2009/05/23/f…]\n",
      "Row 4:\tScore=0.955 Text=[\"ORLANDO, Fla. — While the Rapid Recall Exchange, …\"], URL=[http://supermarketnews.com/food-safety/…]\n",
      "Row 5:\tScore=0.954 Text=[\"September 28, 2010\\n2010 Season - Bowman pulls dow…\"], URL=[http://www.augustana.edu/x22236.xml]\n",
      "Row 6:\tScore=0.967 Text=[\"Kraft Foods has taken the Cadbury chocolate brand…\"], URL=[http://www.fdin.org.uk/2012/01/kraft-la…]\n",
      "Row 7:\tScore=0.874 Text=[\"You must be a registered member to view this page…\"], URL=[http://www.golivewire.com/forums/profil…]\n",
      "Row 8:\tScore=0.912 Text=[\"|Facility Type:||Full Service Restaurant|\\n|Inspec…\"], URL=[http://www.healthspace.com/Clients/VDH/…]\n",
      "Row 9:\tScore=0.925 Text=[\"News of the Week\\nBarrie Spring Studio Tour\\nApril …\"], URL=[http://www.jillpricestudios.ca/artist/w…]\n"
     ]
    }
   ],
   "source": [
    "import (\n",
    "    parquet \"github.com/parquet-go/parquet-go\"\n",
    ")\n",
    "\n",
    "%%\n",
    "// Download repo file.\n",
    "repo := hub.New(FineWebID).WithType(hub.RepoTypeDataset).WithAuth(hfAuthToken)\n",
    "localSampleFile := must.M1(repo.DownloadFile(FineWebSampleFile))\n",
    "\n",
    "// Parquet reading using parquet-go: it's somewhat cumbersome (to open the file it needs its size!?), but it works.\n",
    "schema := parquet.SchemaOf(&FineWebEntry{})\n",
    "fSize := must.M1(os.Stat(localSampleFile)).Size()\n",
    "fReader := must.M1(os.Open(localSampleFile))\n",
    "// The file is opened here, so it is also released here. Being deferred first, it runs after reader.Close().\n",
    "defer fReader.Close()\n",
    "fParquet := must.M1(parquet.OpenFile(fReader, fSize))\n",
    "reader := parquet.NewGenericReader[FineWebEntry](fParquet, schema)\n",
    "defer reader.Close()\n",
    "\n",
    "// Print the first rows (at most 10). Only the n entries reported by Read are printed:\n",
    "// any remaining entries of rows would still hold zero values.\n",
    "rows := make([]FineWebEntry, 10)\n",
    "n := must.M1(reader.Read(rows))\n",
    "fmt.Printf(\"%d rows read\\n\", n)\n",
    "for ii, row := range rows[:n] {\n",
    "    fmt.Printf(\"Row %0d:\\tScore=%.3f Text=[%q], URL=[%s]\\n\", ii, row.Score, TrimString(row.Text, 50), TrimString(row.URL, 40))\n",
    "}\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Go (gonb)",
   "language": "go",
   "name": "gonb"
  },
  "language_info": {
   "codemirror_mode": "",
   "file_extension": ".go",
   "mimetype": "",
   "name": "go",
   "nbconvert_exporter": "",
   "pygments_lexer": "",
   "version": "go1.23.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
